## Ambulance Data related to acute cerebrovascular accidents
brain_data <- read_csv("data/raw/brain_data.csv") %>%
mutate(across(c(Year, Season, Month, DayOff, COVID, Sudden_Start, Dst_level), as.factor))
## Ambulance Long Data related to acute cerebrovascular accidents
brain_data_long <- read_csv("data/raw/brain_data_long.csv") %>%
mutate(across(c(Year, Season, Month, DayOff, COVID, Sudden_Start, Dst_level, Sex, Age), as.factor))
Our dataset contains a large number of quantitative variables:
# Data preprocessing
brain_data_clear <- brain_data %>%
select( where(is.numeric) & !ends_with(c("min", "max", "var", "K_Sum")) ) %>%
filter(complete.cases(.))
# Function for displaying correlations with a neutral color for values close to 0
color_correlation_fixed <- function(data, mapping, ...) {
x <- eval_data_col(data, mapping$x)
y <- eval_data_col(data, mapping$y)
cor_value <- cor(x, y, use = "complete.obs")
# Defining color
color <- if (abs(cor_value) < 0.1) {
scales::alpha("gray50", 0.5) ## Gray for weak correlations
} else if (cor_value > 0) {
scales::alpha("blue", abs(cor_value)) ## Blue for positive correlations
} else {
scales::alpha("red", abs(cor_value)) ## Red for negative correlations
}
# Filter for small correlation values
cor_label <- ifelse(abs(cor_value) < 0.01, "0", sprintf("%.2f", cor_value))
ggplot(data.frame()) +
annotate("text", x = 0.5, y = 0.5, label = cor_label, size = 10, color = color ) +
theme_void()
}
# Function for trend line with a golden color
golden_smooth <- function(data, mapping, ...) {
ggplot(data, mapping) +
geom_point(alpha = 0.4, size = 0.5, color = "black") +
geom_smooth(color = "gold", ...) +
theme_custom
}
# Plot construction
ggpairs(
brain_data_clear,
progress = FALSE,
upper = list(continuous = color_correlation_fixed), ## Highlighting correlations
lower = list(continuous = golden_smooth) )+ ## Golden trend line
theme_custom +
theme(
axis.text.x = element_text(size = 10),
axis.text.y = element_text(size = 10) )
When analyzing the network graph, three groups of features can be identified: - Climatic: temperature, pressure, wind, precipitation; - Geomagnetic: Dst index, Ap index; - Solar: F10.7 index, number of sunspots.
cor_data <- brain_data %>%
select( where(is.numeric) & !ends_with(c("min", "max", "Sum", "var")) ) %>%
rename(Temperature = Temp_mean, Precipitation = H2O,
`F10.7 index` = F10.7, `Ap index` = Ap_mean, `Dst index` = Dst_mean) %>%
cor(use = "pairwise.complete.obs")
corrplot(cor_data, method = 'number', type = 'lower', diag = FALSE)
network_plot(cor_data, min_cor = .1)
Additionally, some features within the groups exhibit a high degree of correlation! - A negative correlation was expected between the daily average temperature and pressure (r = -0.73). - The F10.7 index and the number of sunspots show a high correlation (r = 0.92), as both reflect solar activity but are measured using different methods. - The daily average Ap and Dst indices demonstrate a significant correlation (r = -0.70), as both reflect changes in the Earth’s geomagnetic field. However, the distribution of data along the trend line remains uneven.
brain_data %>%
drop_na() %>%
ggplot(aes(Temp_mean, Pressure))+
geom_jitter(alpha = 0.5, colour = "royalblue2", size = 1.5)+
geom_smooth(method = "lm", se = FALSE, colour = "orangered2", linewidth = 1.2) +
stat_cor(method = "pearson", label.x.npc = 0.6, label.y.npc = 0.9, size = 7)+
labs(x = "Temperature")+
theme_custom
brain_data %>%
ggplot(aes(Sunspots, F10.7))+
geom_jitter(alpha = 0.5, colour = "royalblue2", size = 1.5)+
geom_smooth(method = "lm", se = FALSE, colour = "orangered2", linewidth = 1.2) +
stat_cor(method = "pearson", label.x.npc = 0, label.y.npc = 0.9, size = 7)+
labs(x = "Sunspot Number", y = "F10.7 index")+
theme_custom
brain_data %>%
ggplot(aes(Dst_mean, Ap_mean))+
geom_jitter(alpha = 0.5, colour = "royalblue2", size = 1.5)+
geom_smooth(method = "lm", se = FALSE, colour = "orangered2", linewidth = 1.2) +
stat_cor(method = "pearson", label.x.npc = 0.6, label.y.npc = 0.9, size = 7)+
labs(x = "Daily Average Dst index", y = "Daily Average Ap index")+
theme_custom
Based on
further analysis, the following variables will be excluded: -
Pressure: does not provide full coverage of the data
throughout the study period and exhibits less variability compared to
Temperature. - F10.7 Index: has an incomplete data set
and exhibits less variability compared to Sunspot count. - The
Ap and Dst geomagnetic activity indices will not be
excluded.
However, all the variables considered do not have a noticeable relationship with the number of ambulance calls:
brain_data %>%
rename(Temperature = Temp_mean, `Ap index` = Ap_mean, `Dst index` = Dst_mean) %>%
pivot_longer(cols = c(Temperature, Sunspots, `Ap index`, `Dst index`)) %>%
ggplot(aes(x = value, y = EMS))+
geom_point()+
geom_smooth(se = FALSE, method = "gam", colour = "goldenrod2", linewidth = 2)+
theme_custom +
labs(x = "", y = "Daily Emergency Calls") +
facet_wrap(~name, scales = "free")
The number of sunspots is directly linked to the solar activity cycles, as confirmed by the graph:
brain_data %>%
select(!c(Pressure, F10.7, Ap_mean)) %>%
ggplot(aes(Date, Sunspots))+
geom_line(alpha = 0.3, colour = "royalblue2")+
geom_smooth(method = "gam", formula = y ~ s(x, k = 10), se = FALSE,
colour = "orangered4", linewidth = 2) +
geom_vline(xintercept = as.Date("2009-01-01"),
colour = "orangered", linewidth = 1, linetype = "dashed")+
geom_vline(xintercept = as.Date("2019-12-31"),
colour = "orangered", linewidth = 1, linetype = "dashed")+
scale_x_date(date_breaks = "2 year", date_labels = "%Y")+
labs(x = "", y = "Sunspot number")+
theme_custom
Our data covers the complete 24th solar cycle, spanning the period from December 2008 to December 2019.
Working with time series allows us to decompose the data to identify trends and cyclic patterns:
ggarrange(
brain_data %>% filter(Date >= ymd("2007-01-01")) %>% pull(Sunspots) %>%
ts(start = 2007, frequency = 365) %>% stl(s.window = "periodic") %>%
autoplot() + theme_custom + ggtitle("Sunspots"),
brain_data %>% filter(Date >= ymd("2007-01-01")) %>% pull(EMS) %>%
ts(start = 2007, frequency = 365) %>% stl(s.window = "periodic") %>%
autoplot() + theme_custom + ggtitle("EMS"),
brain_data %>% filter(Date >= ymd("2007-01-01")) %>% pull(Temp_mean) %>%
ts(start = 2007, frequency = 365) %>% stl(s.window = "periodic") %>%
autoplot() + theme_custom + ggtitle("Temperature"),
nrow = 1
)
ggarrange(
brain_data %>% filter(Date >= ymd("2007-01-01")) %>% pull(Sunspots) %>%
ts(start = 2007, frequency = 365) %>% stl(s.window = "periodic") %>%
autoplot() + theme_custom + ggtitle("Sunspots"),
brain_data %>% filter(Date >= ymd("2007-01-01")) %>% pull(Ap_mean) %>%
ts(start = 2007, frequency = 365) %>% stl(s.window = "periodic") %>%
autoplot() + theme_custom + ggtitle("Ap Index"),
brain_data %>% filter(Date >= ymd("2007-01-01")) %>% pull(Dst_var) %>%
ts(start = 2007, frequency = 365) %>% stl(s.window = "periodic") %>%
autoplot() + theme_custom + ggtitle("Dst amplitude"),
nrow = 1
)
- After
decomposing the emergency calls data, a linear increase
over time is observed. However, since 2020, a noticeable decrease is
apparent, likely associated with the COVID-19 pandemic. - Climatic data
(especially temperature), after decomposition, show a
clear cyclic pattern. - The daily Dst index does not
show any clear trends or cycles. However, the daily fluctuation
amplitude of the Dst index, after decomposition,
exhibits a trend similar to the dynamics of sunspot numbers.
Nevertheless, since noise is the dominant component, the results should
not be taken too seriously.
ggarrange(
brain_data %>%
filter(Date %within% interval(ymd("2007-01-01"), ymd("2019-12-31"))) %>%
group_by(Year, Month) %>%
summarise(EMS_month = sum(EMS, na.rm = TRUE), .groups = "drop") %>%
ggplot(aes(x = Month, y = Year, fill = EMS_month)) +
geom_tile(colour = "black") +
scale_fill_gradient(low = "green", high = "red") +
scale_x_discrete(labels = month.abb) +
labs(x = "", y = "", fill = "", title = "Total Number of Emergency Calls")+
theme_custom,
brain_data %>%
filter(Date %within% interval(ymd("2007-01-01"), ymd("2019-12-31"))) %>%
group_by(Year, Month) %>%
summarise(Sunspots_month = sum(Sunspots, na.rm = TRUE), .groups = "drop") %>%
ggplot(aes(x = Month, y = Year, fill = Sunspots_month)) +
geom_tile(colour = "black") +
scale_fill_gradient(low = "green", high = "red") +
scale_x_discrete(labels = month.abb) +
labs(x = "", y = "", fill = "", title = "Total Number of Sunspots")+
theme_custom
)
brain_data %>%
filter(Dst_level != "Severe") %>%
mutate(Dst_level = fct_relevel(Dst_level, c("Quiet", "Weak", "Moderate", "Major", "Severe"))) %>%
ggplot(aes(Dst_level, Sunspots))+
geom_jitter(alpha = 0.5, size = 1, width = 0.2) +
geom_boxplot(alpha = 0.5, outliers = FALSE, fill = "goldenrod2", linewidth = 1)+
labs( x = "Geomagnetic activity level", y = "Sunspot Number" ) +
theme_custom
mean_values <- brain_data %>%
group_by(DayOff) %>%
summarise(mean_EMS = round(mean(EMS, na.rm = TRUE), 2), .groups = "drop")
ggplot(brain_data, aes(x = DayOff, y = EMS)) +
geom_boxplot(aes(fill = DayOff)) +
geom_text(data = mean_values,
aes(x = DayOff, y = mean_EMS, label = mean_EMS),
color = "black", size = 8, fontface = "bold", vjust = -1) +
scale_fill_brewer(palette = "Dark2", direction = -1)+
scale_y_continuous(limits = c(NA, max(brain_data$EMS) + 3))+
labs(y = "Daily Emergency Calls", x = "Non-working Days")+
theme_custom +
theme(legend.position = "none")+
stat_pvalue_manual(t_test(data = brain_data, EMS ~ DayOff),
label = "T-test, p = {p}",
size = 10,
y.position = max(brain_data$EMS) + 0.1)
- The
difference between the average EMS values for weekdays and weekends is
statistically significant. - The average EMS value is higher on weekdays
(13.33) compared to weekends (11.02).
brain_data %>%
mutate(Season = fct_relevel(Season, c("Winter", "Spring", "Summer", "Autumn"))) %>%
ggplot(aes(x = Season, y = EMS, fill = Season)) +
geom_boxplot(alpha = 0.7, show.legend = FALSE) +
scale_fill_manual(
values = c("Winter" = "dodgerblue2", "Spring" = "palegreen2", "Summer" = "tan2","Autumn" = "tomato2")) +
labs(y = "Daily Emergency Calls", x = "")+
theme_custom
brain_data %>%
pairwise_t_test(EMS ~ Season, p.adjust.method = "holm") %>%
transmute(`Group 1` = group1, `Group 2` = group2,
`p-value` = p, `p.adjusted` = p.adj) %>%
flextable()
Group 1 | Group 2 | p-value | p.adjusted |
|---|---|---|---|
Autumn | Spring | 0.0162 | 0.0969 |
Autumn | Summer | 0.1300 | 0.5200 |
Spring | Summer | 0.3670 | 1.0000 |
Autumn | Winter | 0.0259 | 0.1290 |
Spring | Winter | 0.8650 | 1.0000 |
Summer | Winter | 0.4670 | 1.0000 |
The distribution of emergency medical service calls across seasons is statistically insignificant (paired t-test with Holm p-value adjustment method: p.adjusted > 0.05).
brain_data %>%
drop_na(Kp_Sum, K_Sum) %>%
ggplot(aes(Kp_Sum, K_Sum))+
geom_jitter(alpha = 0.5, colour = "royalblue2", size = 1.5)+
geom_smooth(method = "lm", se = FALSE, colour = "orangered2", linewidth = 1.2) +
stat_cor(method = "pearson", label.x.npc = 0, label.y.npc = 0.9, size = 7)+
labs(x = "Daily Average Kp index", y = "Daily Average K index (IRT)")+
theme_custom
brain_data %>%
drop_na(Kp_Sum, K_Sum) %>%
pivot_longer(cols = c(Kp_Sum, K_Sum), names_to = "index") %>%
ggplot() +
geom_histogram(aes(x = value, fill = index),
binwidth = 5, colour = "black", alpha = 0.5, position = "identity", show.legend = FALSE) +
scale_fill_manual(
values = c("K_Sum" = "darkseagreen2", "Kp_Sum" = "steelblue2") )+
scale_x_continuous(breaks = seq(0, 50, 10))+
labs( x = "Index Values", y = "Observation Count", fill = "Index Type") +
theme_custom +
facet_wrap(~index, labeller = labeller(index = c("K_Sum" = "Daily Average K index (IRT)",
"Kp_Sum" = "Daily Average Kp index")))
Daily Average Kp
index and Daily Average K index (IRT) show a strong correlation
(r = 0.91). However, K index (IRT) has a smaller range
and a more uniform distribution compared to Kp index.
ggarrange(
brain_data_long %>%
filter(between(as.numeric(as.character(Year)), 2007, 2021), Age != "55+") %>%
group_by(Year, Sex, Age) %>%
summarise(EMS_total = sum(EMS, na.rm = TRUE), .groups = "drop") %>%
mutate(EMS_plot = ifelse(Sex == "Male", -EMS_total, EMS_total)) %>%
ggplot(aes(x = Year, y = EMS_plot, fill = Sex)) +
geom_bar(stat = "identity", alpha = 0.7, colour = "black", show.legend = FALSE) +
scale_y_continuous(labels = abs, n.breaks = 10) +
scale_fill_manual(
values = c("Male" = "steelblue", "Female" = "pink"),
labels = c("Male" = "Male", "Female" = "Female") ) +
coord_flip() +
labs(
title = "Ambulance Calls Distribution by Year, Age and Gender",
x = "", y = "", fill = "Gender") +
theme_custom +
facet_wrap(. ~ Age, scales = "free_x",
labeller = labeller(Age = c("25-44" = "Age: 25-44 years",
"45-54" = "Age: 45-54 years"))),
brain_data_long %>%
filter(between(as.numeric(as.character(Year)), 2007, 2021), Age == "55+") %>%
group_by(Year, Sex, Age) %>%
summarise(EMS_total = sum(EMS, na.rm = TRUE), .groups = "drop") %>%
mutate(EMS_plot = ifelse(Sex == "Male", -EMS_total, EMS_total)) %>%
ggplot(aes(x = Year, y = EMS_plot, fill = Sex)) +
geom_bar(stat = "identity", alpha = 0.7, colour = "black", ) +
scale_y_continuous(labels = abs, n.breaks = 10) +
scale_fill_manual(
values = c("Male" = "steelblue", "Female" = "pink"),
labels = c("Male" = "Male", "Female" = "Female") ) +
coord_flip() +
labs(
x = "", y = "Total EMS", fill = "Gender") +
theme_custom + theme(legend.position = "inside", legend.justification = c(0.9, 0.2))+
facet_wrap(. ~ Age, scales = "free_x",
labeller = labeller(Age = c("55+" = "Age: 55+ years"))),
nrow = 2
)